Introduction

Restatment of problem

The Data

Libraries

## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:corrgram':
## 
##     panel.fill
## 
## Attaching package: 'tidyr'
## The following object is masked from 'package:magrittr':
## 
##     extract
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following object is masked from 'package:corrgram':
## 
##     baseball
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.0.3     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ✓ purrr   0.3.4
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x psych::%+%()       masks ggplot2::%+%()
## x psych::alpha()     masks ggplot2::alpha()
## x plyr::arrange()    masks dplyr::arrange()
## x purrr::compact()   masks plyr::compact()
## x plyr::count()      masks dplyr::count()
## x tidyr::extract()   masks magrittr::extract()
## x plyr::failwith()   masks dplyr::failwith()
## x dplyr::filter()    masks stats::filter()
## x plyr::id()         masks dplyr::id()
## x dplyr::lag()       masks stats::lag()
## x purrr::lift()      masks caret::lift()
## x plyr::mutate()     masks dplyr::mutate()
## x plyr::rename()     masks dplyr::rename()
## x purrr::set_names() masks magrittr::set_names()
## x plyr::summarise()  masks dplyr::summarise()
## x plyr::summarize()  masks dplyr::summarize()
## 
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
## 
##     arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
## The following objects are masked from 'package:psych':
## 
##     alpha, rescale
## Loading required package: vcd
## Loading required package: grid
## Loading required package: gnm
## 
## Attaching package: 'gnm'
## The following object is masked from 'package:lattice':
## 
##     barley
## 
## Attaching package: 'vcdExtra'
## The following object is masked from 'package:plotly':
## 
##     summarise
## The following object is masked from 'package:plyr':
## 
##     summarise
## The following object is masked from 'package:dplyr':
## 
##     summarise
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
## 
##     select
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: carData
## 
## Attaching package: 'carData'
## The following object is masked from 'package:vcdExtra':
## 
##     Burt
## 
## Attaching package: 'car'
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:psych':
## 
##     logit
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:psych':
## 
##     outlier
## The following object is masked from 'package:ggplot2':
## 
##     margin
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1
## 
## Attaching package: 'plotROC'
## The following object is masked from 'package:pROC':
## 
##     ggroc
## ResourceSelection 0.3-5   2019-07-22
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'mice'
## The following object is masked from 'package:aplore3':
## 
##     nhanes
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
## Loading required package: colorspace
## 
## Attaching package: 'colorspace'
## The following object is masked from 'package:pROC':
## 
##     coords
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
## 
## Attaching package: 'dataMaid'
## The following object is masked from 'package:rmarkdown':
## 
##     render
## The following object is masked from 'package:plyr':
## 
##     summarize
## The following object is masked from 'package:dplyr':
## 
##     summarize
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
## 
##     transpose
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
# Utility function 
isEmpty <- function(column) {
    is.na(column) | column == 0 | column == "" | column == " " | column == "NA" | column == "na" | column == "Na" | column == "nA" | column == "NaN" 
}

Import the dataset Note: we make sure that the levels of Attrition are in the right order

CaseStudy2.data <-read.csv("../data/CaseStudy2-data.csv", header = TRUE, sep = ",", stringsAsFactors = TRUE)
summary(CaseStudy2.data)
##        ID             Age        Attrition           BusinessTravel
##  Min.   :  1.0   Min.   :18.00   No :730   Non-Travel       : 94   
##  1st Qu.:218.2   1st Qu.:30.00   Yes:140   Travel_Frequently:158   
##  Median :435.5   Median :35.00             Travel_Rarely    :618   
##  Mean   :435.5   Mean   :36.83                                     
##  3rd Qu.:652.8   3rd Qu.:43.00                                     
##  Max.   :870.0   Max.   :60.00                                     
##                                                                    
##    DailyRate                       Department  DistanceFromHome   Education    
##  Min.   : 103.0   Human Resources       : 35   Min.   : 1.000   Min.   :1.000  
##  1st Qu.: 472.5   Research & Development:562   1st Qu.: 2.000   1st Qu.:2.000  
##  Median : 817.5   Sales                 :273   Median : 7.000   Median :3.000  
##  Mean   : 815.2                                Mean   : 9.339   Mean   :2.901  
##  3rd Qu.:1165.8                                3rd Qu.:14.000   3rd Qu.:4.000  
##  Max.   :1499.0                                Max.   :29.000   Max.   :5.000  
##                                                                                
##           EducationField EmployeeCount EmployeeNumber   EnvironmentSatisfaction
##  Human Resources : 15    Min.   :1     Min.   :   1.0   Min.   :1.000          
##  Life Sciences   :358    1st Qu.:1     1st Qu.: 477.2   1st Qu.:2.000          
##  Marketing       :100    Median :1     Median :1039.0   Median :3.000          
##  Medical         :270    Mean   :1     Mean   :1029.8   Mean   :2.701          
##  Other           : 52    3rd Qu.:1     3rd Qu.:1561.5   3rd Qu.:4.000          
##  Technical Degree: 75    Max.   :1     Max.   :2064.0   Max.   :4.000          
##                                                                                
##     Gender      HourlyRate     JobInvolvement     JobLevel    
##  Female:354   Min.   : 30.00   Min.   :1.000   Min.   :1.000  
##  Male  :516   1st Qu.: 48.00   1st Qu.:2.000   1st Qu.:1.000  
##               Median : 66.00   Median :3.000   Median :2.000  
##               Mean   : 65.61   Mean   :2.723   Mean   :2.039  
##               3rd Qu.: 83.00   3rd Qu.:3.000   3rd Qu.:3.000  
##               Max.   :100.00   Max.   :4.000   Max.   :5.000  
##                                                               
##                       JobRole    JobSatisfaction  MaritalStatus MonthlyIncome  
##  Sales Executive          :200   Min.   :1.000   Divorced:191   Min.   : 1081  
##  Research Scientist       :172   1st Qu.:2.000   Married :410   1st Qu.: 2840  
##  Laboratory Technician    :153   Median :3.000   Single  :269   Median : 4946  
##  Manufacturing Director   : 87   Mean   :2.709                  Mean   : 6390  
##  Healthcare Representative: 76   3rd Qu.:4.000                  3rd Qu.: 8182  
##  Sales Representative     : 53   Max.   :4.000                  Max.   :19999  
##  (Other)                  :129                                                 
##   MonthlyRate    NumCompaniesWorked Over18  OverTime  PercentSalaryHike
##  Min.   : 2094   Min.   :0.000      Y:870   No :618   Min.   :11.0     
##  1st Qu.: 8092   1st Qu.:1.000              Yes:252   1st Qu.:12.0     
##  Median :14074   Median :2.000                        Median :14.0     
##  Mean   :14326   Mean   :2.728                        Mean   :15.2     
##  3rd Qu.:20456   3rd Qu.:4.000                        3rd Qu.:18.0     
##  Max.   :26997   Max.   :9.000                        Max.   :25.0     
##                                                                        
##  PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
##  Min.   :3.000     Min.   :1.000            Min.   :80    Min.   :0.0000  
##  1st Qu.:3.000     1st Qu.:2.000            1st Qu.:80    1st Qu.:0.0000  
##  Median :3.000     Median :3.000            Median :80    Median :1.0000  
##  Mean   :3.152     Mean   :2.707            Mean   :80    Mean   :0.7839  
##  3rd Qu.:3.000     3rd Qu.:4.000            3rd Qu.:80    3rd Qu.:1.0000  
##  Max.   :4.000     Max.   :4.000            Max.   :80    Max.   :3.0000  
##                                                                           
##  TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany  
##  Min.   : 0.00     Min.   :0.000         Min.   :1.000   Min.   : 0.000  
##  1st Qu.: 6.00     1st Qu.:2.000         1st Qu.:2.000   1st Qu.: 3.000  
##  Median :10.00     Median :3.000         Median :3.000   Median : 5.000  
##  Mean   :11.05     Mean   :2.832         Mean   :2.782   Mean   : 6.962  
##  3rd Qu.:15.00     3rd Qu.:3.000         3rd Qu.:3.000   3rd Qu.:10.000  
##  Max.   :40.00     Max.   :6.000         Max.   :4.000   Max.   :40.000  
##                                                                          
##  YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.000     Min.   : 0.000          Min.   : 0.00       
##  1st Qu.: 2.000     1st Qu.: 0.000          1st Qu.: 2.00       
##  Median : 3.000     Median : 1.000          Median : 3.00       
##  Mean   : 4.205     Mean   : 2.169          Mean   : 4.14       
##  3rd Qu.: 7.000     3rd Qu.: 3.000          3rd Qu.: 7.00       
##  Max.   :18.000     Max.   :15.000          Max.   :17.00       
## 
CaseStudy2.data$Attrition <- ordered(CaseStudy2.data$Attrition, levels=c("Yes", "No"))
str(CaseStudy2.data)
## 'data.frame':    870 obs. of  36 variables:
##  $ ID                      : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Age                     : int  32 40 35 32 24 27 41 37 34 34 ...
##  $ Attrition               : Ord.factor w/ 2 levels "Yes"<"No": 2 2 2 2 2 2 2 2 2 2 ...
##  $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 2 3 2 2 3 3 3 2 ...
##  $ DailyRate               : int  117 1308 200 801 567 294 1283 309 1333 653 ...
##  $ Department              : Factor w/ 3 levels "Human Resources",..: 3 2 2 3 2 2 2 3 3 2 ...
##  $ DistanceFromHome        : int  13 14 18 1 2 10 5 10 10 10 ...
##  $ Education               : int  4 3 2 4 1 2 5 4 4 4 ...
##  $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 4 2 3 6 2 4 2 2 6 ...
##  $ EmployeeCount           : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ EmployeeNumber          : int  859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
##  $ EnvironmentSatisfaction : int  2 3 3 3 1 4 2 4 3 4 ...
##  $ Gender                  : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 2 1 1 2 ...
##  $ HourlyRate              : int  73 44 60 48 32 32 90 88 87 92 ...
##  $ JobInvolvement          : int  3 2 3 3 3 3 4 2 3 2 ...
##  $ JobLevel                : int  2 5 3 3 1 3 1 2 1 2 ...
##  $ JobRole                 : Factor w/ 9 levels "Healthcare Representative",..: 8 6 5 8 7 5 7 8 9 1 ...
##  $ JobSatisfaction         : int  4 3 4 4 4 1 3 4 3 3 ...
##  $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 1 3 3 2 3 1 2 1 2 2 ...
##  $ MonthlyIncome           : int  4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
##  $ MonthlyRate             : int  9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
##  $ NumCompaniesWorked      : int  2 1 2 1 1 1 2 2 1 1 ...
##  $ Over18                  : Factor w/ 1 level "Y": 1 1 1 1 1 1 1 1 1 1 ...
##  $ OverTime                : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 2 2 2 1 ...
##  $ PercentSalaryHike       : int  11 14 11 19 13 21 12 14 19 14 ...
##  $ PerformanceRating       : int  3 3 3 3 3 4 3 3 3 3 ...
##  $ RelationshipSatisfaction: int  3 1 3 3 3 3 1 3 4 2 ...
##  $ StandardHours           : int  80 80 80 80 80 80 80 80 80 80 ...
##  $ StockOptionLevel        : int  1 0 0 2 0 2 0 3 1 1 ...
##  $ TotalWorkingYears       : int  8 21 10 14 6 9 7 8 1 8 ...
##  $ TrainingTimesLastYear   : int  3 2 2 3 2 4 5 5 2 3 ...
##  $ WorkLifeBalance         : int  2 4 3 3 3 2 2 3 3 2 ...
##  $ YearsAtCompany          : int  5 20 2 14 6 9 4 1 1 8 ...
##  $ YearsInCurrentRole      : int  2 7 2 10 3 7 2 0 1 2 ...
##  $ YearsSinceLastPromotion : int  0 4 2 5 1 1 0 0 0 7 ...
##  $ YearsWithCurrManager    : int  3 9 2 7 3 7 3 0 0 7 ...

3. Assess the Data

Cleanup Factors for anything that looks categorical in nature.

summary(CaseStudy2.data)
##        ID             Age        Attrition           BusinessTravel
##  Min.   :  1.0   Min.   :18.00   Yes:140   Non-Travel       : 94   
##  1st Qu.:218.2   1st Qu.:30.00   No :730   Travel_Frequently:158   
##  Median :435.5   Median :35.00             Travel_Rarely    :618   
##  Mean   :435.5   Mean   :36.83                                     
##  3rd Qu.:652.8   3rd Qu.:43.00                                     
##  Max.   :870.0   Max.   :60.00                                     
##                                                                    
##    DailyRate                       Department  DistanceFromHome   Education    
##  Min.   : 103.0   Human Resources       : 35   Min.   : 1.000   Min.   :1.000  
##  1st Qu.: 472.5   Research & Development:562   1st Qu.: 2.000   1st Qu.:2.000  
##  Median : 817.5   Sales                 :273   Median : 7.000   Median :3.000  
##  Mean   : 815.2                                Mean   : 9.339   Mean   :2.901  
##  3rd Qu.:1165.8                                3rd Qu.:14.000   3rd Qu.:4.000  
##  Max.   :1499.0                                Max.   :29.000   Max.   :5.000  
##                                                                                
##           EducationField EmployeeCount EmployeeNumber   EnvironmentSatisfaction
##  Human Resources : 15    Min.   :1     Min.   :   1.0   Min.   :1.000          
##  Life Sciences   :358    1st Qu.:1     1st Qu.: 477.2   1st Qu.:2.000          
##  Marketing       :100    Median :1     Median :1039.0   Median :3.000          
##  Medical         :270    Mean   :1     Mean   :1029.8   Mean   :2.701          
##  Other           : 52    3rd Qu.:1     3rd Qu.:1561.5   3rd Qu.:4.000          
##  Technical Degree: 75    Max.   :1     Max.   :2064.0   Max.   :4.000          
##                                                                                
##     Gender      HourlyRate     JobInvolvement     JobLevel    
##  Female:354   Min.   : 30.00   Min.   :1.000   Min.   :1.000  
##  Male  :516   1st Qu.: 48.00   1st Qu.:2.000   1st Qu.:1.000  
##               Median : 66.00   Median :3.000   Median :2.000  
##               Mean   : 65.61   Mean   :2.723   Mean   :2.039  
##               3rd Qu.: 83.00   3rd Qu.:3.000   3rd Qu.:3.000  
##               Max.   :100.00   Max.   :4.000   Max.   :5.000  
##                                                               
##                       JobRole    JobSatisfaction  MaritalStatus MonthlyIncome  
##  Sales Executive          :200   Min.   :1.000   Divorced:191   Min.   : 1081  
##  Research Scientist       :172   1st Qu.:2.000   Married :410   1st Qu.: 2840  
##  Laboratory Technician    :153   Median :3.000   Single  :269   Median : 4946  
##  Manufacturing Director   : 87   Mean   :2.709                  Mean   : 6390  
##  Healthcare Representative: 76   3rd Qu.:4.000                  3rd Qu.: 8182  
##  Sales Representative     : 53   Max.   :4.000                  Max.   :19999  
##  (Other)                  :129                                                 
##   MonthlyRate    NumCompaniesWorked Over18  OverTime  PercentSalaryHike
##  Min.   : 2094   Min.   :0.000      Y:870   No :618   Min.   :11.0     
##  1st Qu.: 8092   1st Qu.:1.000              Yes:252   1st Qu.:12.0     
##  Median :14074   Median :2.000                        Median :14.0     
##  Mean   :14326   Mean   :2.728                        Mean   :15.2     
##  3rd Qu.:20456   3rd Qu.:4.000                        3rd Qu.:18.0     
##  Max.   :26997   Max.   :9.000                        Max.   :25.0     
##                                                                        
##  PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
##  Min.   :3.000     Min.   :1.000            Min.   :80    Min.   :0.0000  
##  1st Qu.:3.000     1st Qu.:2.000            1st Qu.:80    1st Qu.:0.0000  
##  Median :3.000     Median :3.000            Median :80    Median :1.0000  
##  Mean   :3.152     Mean   :2.707            Mean   :80    Mean   :0.7839  
##  3rd Qu.:3.000     3rd Qu.:4.000            3rd Qu.:80    3rd Qu.:1.0000  
##  Max.   :4.000     Max.   :4.000            Max.   :80    Max.   :3.0000  
##                                                                           
##  TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany  
##  Min.   : 0.00     Min.   :0.000         Min.   :1.000   Min.   : 0.000  
##  1st Qu.: 6.00     1st Qu.:2.000         1st Qu.:2.000   1st Qu.: 3.000  
##  Median :10.00     Median :3.000         Median :3.000   Median : 5.000  
##  Mean   :11.05     Mean   :2.832         Mean   :2.782   Mean   : 6.962  
##  3rd Qu.:15.00     3rd Qu.:3.000         3rd Qu.:3.000   3rd Qu.:10.000  
##  Max.   :40.00     Max.   :6.000         Max.   :4.000   Max.   :40.000  
##                                                                          
##  YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.000     Min.   : 0.000          Min.   : 0.00       
##  1st Qu.: 2.000     1st Qu.: 0.000          1st Qu.: 2.00       
##  Median : 3.000     Median : 1.000          Median : 3.00       
##  Mean   : 4.205     Mean   : 2.169          Mean   : 4.14       
##  3rd Qu.: 7.000     3rd Qu.: 3.000          3rd Qu.: 7.00       
##  Max.   :18.000     Max.   :15.000          Max.   :17.00       
## 
CaseStudy2.data$PerformanceRating <- as.factor(CaseStudy2.data$PerformanceRating)
CaseStudy2.data$RelationshipSatisfaction <- as.factor(CaseStudy2.data$RelationshipSatisfaction)
CaseStudy2.data$StockOptionLevel <- as.factor(CaseStudy2.data$StockOptionLevel)
CaseStudy2.data$TrainingTimesLastYear <- as.factor(CaseStudy2.data$TrainingTimesLastYear)
CaseStudy2.data$EnvironmentSatisfaction <- as.factor(CaseStudy2.data$EnvironmentSatisfaction)
CaseStudy2.data$WorkLifeBalance <- as.factor(CaseStudy2.data$WorkLifeBalance)
CaseStudy2.data$Education <- as.factor(CaseStudy2.data$Education)
CaseStudy2.data$PerformanceRating <- as.factor(CaseStudy2.data$PerformanceRating)
CaseStudy2.data$RelationshipSatisfaction <- as.factor(CaseStudy2.data$RelationshipSatisfaction)
CaseStudy2.data$StockOptionLevel <- as.factor(CaseStudy2.data$StockOptionLevel)
CaseStudy2.data$TrainingTimesLastYear <- as.factor(CaseStudy2.data$TrainingTimesLastYear)
CaseStudy2.data$EnvironmentSatisfaction <- as.factor(CaseStudy2.data$EnvironmentSatisfaction)
CaseStudy2.data$JobInvolvement <- as.factor(CaseStudy2.data$JobInvolvement)
CaseStudy2.data$JobSatisfaction <- as.factor(CaseStudy2.data$JobSatisfaction)
CaseStudy2.data$JobLevel <- as.factor(CaseStudy2.data$JobLevel)

Missing data - There is none

# Plot missing data (there should be none)
bdat_mice_clean <- aggr(CaseStudy2.data, col=c('navyblue','yellow'),
                    numbers=TRUE, sortVars=TRUE,
                    labels=names(CaseStudy2.data), cex.axis=.7,
                    gap=3, ylab=c("Missing Data (distribution)","Missing Data (Pattern)"))

## 
##  Variables sorted by number of missings: 
##                  Variable Count
##                        ID     0
##                       Age     0
##                 Attrition     0
##            BusinessTravel     0
##                 DailyRate     0
##                Department     0
##          DistanceFromHome     0
##                 Education     0
##            EducationField     0
##             EmployeeCount     0
##            EmployeeNumber     0
##   EnvironmentSatisfaction     0
##                    Gender     0
##                HourlyRate     0
##            JobInvolvement     0
##                  JobLevel     0
##                   JobRole     0
##           JobSatisfaction     0
##             MaritalStatus     0
##             MonthlyIncome     0
##               MonthlyRate     0
##        NumCompaniesWorked     0
##                    Over18     0
##                  OverTime     0
##         PercentSalaryHike     0
##         PerformanceRating     0
##  RelationshipSatisfaction     0
##             StandardHours     0
##          StockOptionLevel     0
##         TotalWorkingYears     0
##     TrainingTimesLastYear     0
##           WorkLifeBalance     0
##            YearsAtCompany     0
##        YearsInCurrentRole     0
##   YearsSinceLastPromotion     0
##      YearsWithCurrManager     0

DataCleanup

Remove the ID data. Additionally, remove any field which has ONLY 1 value for this exercise.

CaseStudy2.data.NoIdOrUselessData <- CaseStudy2.data %>% dplyr::select(-c("ID", "EmployeeCount", "Over18", "StandardHours"))

Test Splits

Split the data randomly 85% Train data, 15% Test data

set.seed(223)
trainIndex <- createDataPartition(CaseStudy2.data.NoIdOrUselessData$Attrition, p = .80, list = FALSE, times = 1)
CaseStudy2.dtrain <- CaseStudy2.data.NoIdOrUselessData[trainIndex,]
CaseStudy2.dtest <- CaseStudy2.data.NoIdOrUselessData[-trainIndex,]

Note: We assume that this data does not need to be corrected for serial correlation/autocorrelation. In truth, there may be some serial correlation with employees that were hired later or hired earlier, or just in what year there was more atrition. There are many other possible confounding factors such as the economy that we will ignore for the sake of this analysis.

Utility Functions for Confusion Matrix With Custom threshold.

confusionMatrixForCustomThreshold <- function(model, data, threshold, probabilities=NULL) {
  if(is.null(probabilities)) {
    probabilities <- predict(model, newdata = data, type = "prob")    
  }
  
  preds2 <- factor(ifelse(probabilities$Yes >= threshold,"Yes", "No"), levels=c("Yes","No"))
  CM.Train <- confusionMatrix(preds2, data$Attrition)
  return(CM.Train)
} 

plotConfusionMatrixByThreshold <- function(model, data, testTitle) {
  thresholdSequence <- seq(0, 1, by = 0.001)
  accuracy<-c()
  sensitivities<-c()
  specificities<-c()
  probabilities <- predict(model, newdata = data, type = "prob")
  for(i in 1:length(thresholdSequence)) {
    confusionMatrix <- confusionMatrixForCustomThreshold(model, data, thresholdSequence[i], probabilities)
    accuracy[i] <- unname(confusionMatrix$overall['Accuracy'])
    sensitivities[i] <- unname(confusionMatrix$byClass['Sensitivity'])
    specificities[i] <- unname(confusionMatrix$byClass['Specificity'])
  }
  
  plot(x=thresholdSequence, y=accuracy,lty=2,lwd=2,col="red", xlab="threshold", ylab="probability", main=testTitle)
  points(x=thresholdSequence, y=sensitivities, col="green")
  points(x=thresholdSequence, y=specificities, col="blue")
  legend("bottomright", legend=c("Accuracy", "Sensitivity", "Specificity"),
         col=c("red", "green", "blue"), lty=1:2, cex=0.8)
}

3. EDA

Note: We have an imbalanced dataset. There are FAR more “No” entries than “Yes” entries for attrition. Further we can see that there is multicolinearity between some of the variables. More on this below.

CaseStudy2.data.NoIdOrUselessData %>% ggplot(aes(x=Attrition, fill = Attrition)) + geom_bar() + ggtitle("Attrition Count")

ggpairs(CaseStudy2.data.NoIdOrUselessData,columns=1:6,aes(colour=Attrition), legend =1, progress = FALSE) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggpairs(CaseStudy2.data.NoIdOrUselessData,columns=7:11,aes(colour=Attrition), legend =1, progress = FALSE) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggpairs(CaseStudy2.data.NoIdOrUselessData,columns=12:17,aes(colour=Attrition), legend =1, progress = FALSE) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggpairs(CaseStudy2.data.NoIdOrUselessData,columns=18:22,aes(colour=Attrition), legend =1, progress = FALSE) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggpairs(CaseStudy2.data.NoIdOrUselessData,columns=23:32,aes(colour=Attrition), legend =1, progress = FALSE) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

PCA

To see whether or not there is any seperation to begin with (amongs the continuous variables in our dataset, we perform PCA)

# PCA will only really work for numerical variables, select only numerical variables. 
# Ignore Employee Count, it is always 1. Standard hours is ALWAYS 80. We can ignore that too. 
reduced.numerical <- CaseStudy2.data.NoIdOrUselessData %>% dplyr::select(Age, DailyRate, DistanceFromHome, EmployeeNumber, HourlyRate, MonthlyIncome, MonthlyRate, NumCompaniesWorked, PercentSalaryHike, TotalWorkingYears, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager)
pc.result<-prcomp(reduced.numerical, center= TRUE, scale. = TRUE)
summary(pc.result)
## Importance of components:
##                           PC1    PC2     PC3    PC4    PC5     PC6     PC7
## Standard deviation     2.0059 1.2956 1.05657 1.0213 1.0165 0.98226 0.97339
## Proportion of Variance 0.2874 0.1199 0.07974 0.0745 0.0738 0.06892 0.06768
## Cumulative Proportion  0.2874 0.4073 0.48706 0.5616 0.6354 0.70428 0.77195
##                            PC8     PC9    PC10    PC11   PC12    PC13    PC14
## Standard deviation     0.94497 0.82918 0.72764 0.69627 0.5331 0.43047 0.35827
## Proportion of Variance 0.06378 0.04911 0.03782 0.03463 0.0203 0.01324 0.00917
## Cumulative Proportion  0.83574 0.88485 0.92267 0.95730 0.9776 0.99083 1.00000
pc.result
## Standard deviations (1, .., p=14):
##  [1] 2.0059450 1.2956270 1.0565657 1.0212669 1.0164890 0.9822607 0.9733940
##  [8] 0.9449691 0.8291810 0.7276433 0.6962723 0.5331140 0.4304664 0.3582698
## 
## Rotation (n x k) = (14 x 14):
##                                  PC1         PC2         PC3          PC4
## Age                      0.268657604 -0.46969559  0.03801522  0.004793867
## DailyRate               -0.015632448 -0.08607082  0.47290417 -0.244349796
## DistanceFromHome        -0.009042931  0.01006789  0.53123059  0.431806807
## EmployeeNumber           0.020613595 -0.01516990 -0.28143762  0.602566807
## HourlyRate               0.009607752 -0.06470642  0.46587922  0.461291620
## MonthlyIncome            0.351601205 -0.30812876 -0.01319624 -0.002860960
## MonthlyRate              0.021193417 -0.11844544 -0.27625186  0.347283050
## NumCompaniesWorked       0.022117913 -0.56513974 -0.03939699 -0.130748171
## PercentSalaryHike       -0.044458408  0.02564669  0.33753827 -0.192364833
## TotalWorkingYears        0.419495870 -0.31386835  0.01535436 -0.012617636
## YearsAtCompany           0.444574253  0.21971029  0.01746146 -0.021384145
## YearsInCurrentRole       0.396199671  0.26247011  0.04378918 -0.028553516
## YearsSinceLastPromotion  0.350788132  0.19964615 -0.03554482  0.025563539
## YearsWithCurrManager     0.383134246  0.28794274  0.02765768 -0.033735173
##                                 PC5         PC6         PC7          PC8
## Age                     -0.04592299 -0.07558154 -0.01097235 -0.024213353
## DailyRate                0.21979058  0.73218168  0.06161879  0.330659252
## DistanceFromHome        -0.29716545 -0.33498780 -0.09203440  0.546351080
## EmployeeNumber           0.09095769  0.40608518 -0.61853786 -0.001755972
## HourlyRate               0.38510061 -0.02808084  0.27629416 -0.579177883
## MonthlyIncome           -0.06040673 -0.03702276 -0.03223729  0.050688038
## MonthlyRate             -0.48506408  0.36752611  0.63293946  0.031555809
## NumCompaniesWorked       0.09485781 -0.01127824 -0.11796538 -0.045494002
## PercentSalaryHike       -0.67297948  0.16823587 -0.33247393 -0.499254233
## TotalWorkingYears       -0.03053411 -0.05581647 -0.02981403  0.014846964
## YearsAtCompany           0.01029127  0.01568290 -0.01020871 -0.009857063
## YearsInCurrentRole      -0.02222431  0.09280639  0.02815357 -0.014082742
## YearsSinceLastPromotion  0.04139600 -0.02169833  0.02586783 -0.021490465
## YearsWithCurrManager     0.02487533  0.05445161 -0.01111947 -0.016163807
##                                  PC9         PC10         PC11         PC12
## Age                     -0.199671381 -0.310612359  0.704909667  0.031703624
## DailyRate               -0.056423087 -0.082806448 -0.017912911 -0.029974271
## DistanceFromHome         0.150494611  0.016873575 -0.014730989 -0.008062339
## EmployeeNumber          -0.016537886 -0.006804817  0.027588072  0.011499996
## HourlyRate              -0.005154642  0.046212685 -0.052634484 -0.001143078
## MonthlyIncome           -0.427504712  0.217762765 -0.541763959 -0.007841276
## MonthlyRate              0.111705888  0.023929120 -0.010011862 -0.045563589
## NumCompaniesWorked       0.757355597  0.146377660 -0.138581967 -0.009590849
## PercentSalaryHike        0.010275009 -0.048174904 -0.046362033 -0.028479294
## TotalWorkingYears       -0.147735635  0.048416249 -0.104994783 -0.039311910
## YearsAtCompany           0.008429314  0.077395487  0.005471988 -0.001417094
## YearsInCurrentRole       0.212843634  0.221997633  0.133424346  0.746451728
## YearsSinceLastPromotion  0.247975978 -0.798238146 -0.302495624 -0.100200306
## YearsWithCurrManager     0.190725312  0.357287941  0.252409719 -0.652747705
##                                  PC13          PC14
## Age                      0.1633557935  0.1904632816
## DailyRate               -0.0145624378  0.0002688141
## DistanceFromHome         0.0003815934  0.0210793918
## EmployeeNumber          -0.0044771135 -0.0068717557
## HourlyRate               0.0039682314  0.0078256926
## MonthlyIncome            0.4104839924  0.2835049859
## MonthlyRate             -0.0309552690  0.0210919371
## NumCompaniesWorked      -0.0103578652  0.1481613000
## PercentSalaryHike       -0.0066217009 -0.0004830507
## TotalWorkingYears       -0.4468076482 -0.6954001992
## YearsAtCompany          -0.6389702137  0.5817127252
## YearsInCurrentRole       0.2643059279 -0.1478828957
## YearsSinceLastPromotion  0.1711834086 -0.0483017202
## YearsWithCurrManager     0.3104497107 -0.1192701687
pc.scores<-data.frame(pc.result$x)
pc.scores$Attrition= as.factor(CaseStudy2.data$Attrition)

plot_ly(x = pc.scores$PC1, y = pc.scores$PC2, z = pc.scores$PC3, type="scatter3d", mode="markers", color=pc.scores$Attrition, title="Principal Components 1-3")
## Warning: 'scatter3d' objects don't have these attributes: 'title'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'x', 'y', 'z', 'text', 'texttemplate', 'hovertext', 'hovertemplate', 'mode', 'surfaceaxis', 'surfacecolor', 'projection', 'connectgaps', 'line', 'marker', 'textposition', 'textfont', 'hoverinfo', 'error_x', 'error_y', 'error_z', 'xcalendar', 'ycalendar', 'zcalendar', 'scene', 'idssrc', 'customdatasrc', 'metasrc', 'xsrc', 'ysrc', 'zsrc', 'textsrc', 'texttemplatesrc', 'hovertextsrc', 'hovertemplatesrc', 'textpositionsrc', 'hoverinfosrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'

## Warning: 'scatter3d' objects don't have these attributes: 'title'
## Valid attributes include:
## 'type', 'visible', 'showlegend', 'legendgroup', 'opacity', 'name', 'uid', 'ids', 'customdata', 'meta', 'hoverlabel', 'stream', 'transforms', 'uirevision', 'x', 'y', 'z', 'text', 'texttemplate', 'hovertext', 'hovertemplate', 'mode', 'surfaceaxis', 'surfacecolor', 'projection', 'connectgaps', 'line', 'marker', 'textposition', 'textfont', 'hoverinfo', 'error_x', 'error_y', 'error_z', 'xcalendar', 'ycalendar', 'zcalendar', 'scene', 'idssrc', 'customdatasrc', 'metasrc', 'xsrc', 'ysrc', 'zsrc', 'textsrc', 'texttemplatesrc', 'hovertextsrc', 'hovertemplatesrc', 'textpositionsrc', 'hoverinfosrc', 'key', 'set', 'frame', 'transforms', '_isNestedKey', '_isSimpleKey', '_isGraticule', '_bbox'
ggpairs(pc.scores,columns=1:5,aes(colour=Attrition), legend =1, , progress = FALSE) 

## Cluster and HeatMap This HeatMap is used to show similarities between data with a non-parameteric hierarchical cluster. Our distance metric is still euclidean, so the clustering of categorical variables may not be totally accurate.

pheatmap::pheatmap(data.matrix(CaseStudy2.data.NoIdOrUselessData), scale = "column")

## Multicolinearity We can see that there are high GVIF values for certain predictors in our model. GVIF is used so that we may include categorical variables as well. There is conflicting information on what to do with GVIF (rules of thumb). We will just keep the high VIF values in mind when building plots to show multicolinarity below.

CaseStudy2.data.NoIdOrUselessData.Copy <- data.frame(CaseStudy2.data.NoIdOrUselessData)
CaseStudy2.data.NoIdOrUselessData.Copy$AgeDummy <- CaseStudy2.data.NoIdOrUselessData.Copy$Age
data.table(vif(lm(AgeDummy~., data=CaseStudy2.data.NoIdOrUselessData.Copy)))
## Warning in summary.lm(object, ...): essentially perfect fit: summary may be
## unreliable
##            GVIF Df GVIF^(1/(2*Df))
##  1:    2.060730  1        1.435524
##  2:    1.549002  1        1.244589
##  3:    1.185951  2        1.043558
##  4:    1.076613  1        1.037599
##  5:  121.573870  2        3.320550
##  6:    1.113235  1        1.055099
##  7:    1.508926  4        1.052770
##  8:    3.473377  5        1.132596
##  9:    1.096800  1        1.047282
## 10:    1.263808  3        1.039793
## 11:    1.065564  1        1.032261
## 12:    1.075300  1        1.036967
## 13:    1.304695  3        1.045325
## 14:   61.766716  4        1.674343
## 15: 2194.770794  8        1.617472
## 16:    1.286452  3        1.042875
## 17:    3.204596  2        1.337961
## 18:   22.471085  1        4.740368
## 19:    1.088645  1        1.043382
## 20:    1.445871  1        1.202444
## 21:    1.176766  1        1.084789
## 22:    2.709157  1        1.645952
## 23:    2.689863  1        1.640080
## 24:    1.263835  3        1.039797
## 25:    3.534365  3        1.234199
## 26:    5.936916  1        2.436579
## 27:    1.561017  6        1.037809
## 28:    1.243624  3        1.037007
## 29:    5.461741  1        2.337037
## 30:    3.113613  1        1.764543
## 31:    1.959381  1        1.399779
## 32:    2.922524  1        1.709539
##            GVIF Df GVIF^(1/(2*Df))

Factor Analysis:

Variable Importance After further analysis, we can see that, when predicting Attrition, that Random Forests seem to perform the best for the models we’ve tried. Since Random Forest performs the best for our Attrition Classification models, we will use it’s ranking of variable importance to guide our comparison of mutivariate (categorical and numerical) variable importance.

ctrl <- trainControl(method="cv", summaryFunction=twoClassSummary, classProbs=T, savePredictions = T, number = 5)
fit2.HigherComplexity.Rf <- train(Attrition ~ ., data = CaseStudy2.data.NoIdOrUselessData, method = 'rf', trControl = ctrl, ntree = 200, metric = "ROC")

fit2.HigherComplexity.Rf$metric
## [1] "ROC"
fit2.HigherComplexity.Rf
## Random Forest 
## 
## 870 samples
##  31 predictor
##   2 classes: 'Yes', 'No' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 696, 696, 696, 696, 696 
## Resampling results across tuning parameters:
## 
##   mtry  ROC        Sens        Spec     
##    2    0.7656800  0.03571429  1.0000000
##   35    0.7785959  0.22142857  0.9780822
##   68    0.7587573  0.25714286  0.9767123
## 
## ROC was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 35.
# Train Confusion Matrix
plotConfusionMatrixByThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtrain, "Train Confusion Matrix Stats - Random Forest")

threshold <- .4
print("Train Confusion Matrix")
## [1] "Train Confusion Matrix"
print(paste("Threshold was:", threshold))
## [1] "Threshold was: 0.4"
confusionMatrixForCustomThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtrain, threshold)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Yes  No
##        Yes 112   0
##        No    0 584
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9947, 1)
##     No Information Rate : 0.8391     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.1609     
##          Detection Rate : 0.1609     
##    Detection Prevalence : 0.1609     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : Yes        
## 
# Test Confusion Matrix
plotConfusionMatrixByThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtest, "Test Confusion Matrix Stats - Random Forest")

print("Test Confusion Matrix")
## [1] "Test Confusion Matrix"
print(paste("Threshold was:", threshold))
## [1] "Threshold was: 0.4"
confusionMatrixForCustomThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtest, threshold)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Yes  No
##        Yes  28   0
##        No    0 146
##                                     
##                Accuracy : 1         
##                  95% CI : (0.979, 1)
##     No Information Rate : 0.8391    
##     P-Value [Acc > NIR] : 5.519e-14 
##                                     
##                   Kappa : 1         
##                                     
##  Mcnemar's Test P-Value : NA        
##                                     
##             Sensitivity : 1.0000    
##             Specificity : 1.0000    
##          Pos Pred Value : 1.0000    
##          Neg Pred Value : 1.0000    
##              Prevalence : 0.1609    
##          Detection Rate : 0.1609    
##    Detection Prevalence : 0.1609    
##       Balanced Accuracy : 1.0000    
##                                     
##        'Positive' Class : Yes       
## 
summary(fit2.HigherComplexity.Rf$finalModel)
##                 Length Class      Mode     
## call               5   -none-     call     
## type               1   -none-     character
## predicted        870   factor     numeric  
## err.rate         600   -none-     numeric  
## confusion          6   -none-     numeric  
## votes           1740   matrix     numeric  
## oob.times        870   -none-     numeric  
## classes            2   -none-     character
## importance        68   -none-     numeric  
## importanceSD       0   -none-     NULL     
## localImportance    0   -none-     NULL     
## proximity          0   -none-     NULL     
## ntree              1   -none-     numeric  
## mtry               1   -none-     numeric  
## forest            14   -none-     list     
## y                870   ordered    numeric  
## test               0   -none-     NULL     
## inbag              0   -none-     NULL     
## xNames            68   -none-     character
## problemType        1   -none-     character
## tuneValue          1   data.frame list     
## obsLevels          2   -none-     character
## param              1   -none-     list
# ROC Plot
res <- evalm(fit2.HigherComplexity.Rf,gnames=c('Train'), title='ROC: RandomForest') 
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 870
## Number of groups: 1
## Observations per group: 870
## Positive: No
## Negative: Yes
## Group: Train
## Positive: 730
## Negative: 140
## ***Performance Metrics***

## Train Optimal Informedness = 0.421624266144814
## Train AUC-ROC = 0.77

plot(varImp(fit2.HigherComplexity.Rf, scale = FALSE), top = 8, main="Variable Importance: Random Forest")

Variable Importance MutliColinarity Analysis and Further EDA

Here we plot many of the “top” variables against one another to show multicolinearity between some of the different variables. Notice that ‘MonthlyIncome’ is highly colinear with ‘TotalWorkingYears’. It is also highly multicolinear with ‘YearsAtCompany’. As such, we can pick MonthlyIncome and leave out TotalWorkingYears as well as YearsAtCompany from the “top factors” list.

Top10VariableData <- CaseStudy2.data.NoIdOrUselessData %>% dplyr::select(Attrition, MonthlyIncome, OverTime, Age, JobRole, DailyRate, TotalWorkingYears, EmployeeNumber, MonthlyRate, DistanceFromHome)
ggpairs(Top10VariableData,aes(colour=Attrition), columns=2:9, legend =1, progress = FALSE) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

CaseStudy2.data.NoIdOrUselessData %>% ggplot(aes(x=Age, y=MonthlyIncome, colour = Attrition)) + geom_point() + geom_smooth() + ggtitle("MonthlyIncome vs. Age")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

CaseStudy2.data.NoIdOrUselessData %>% ggplot(aes(x=TotalWorkingYears, y=MonthlyIncome, colour = Attrition)) + geom_point() + geom_smooth() + ggtitle("MonthlyIncome vs. TotalWorkingYears")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

CaseStudy2.data.NoIdOrUselessData %>% ggplot(aes(x=YearsAtCompany, y=MonthlyIncome, colour = Attrition)) + geom_point() + geom_smooth() + ggtitle("MonthlyIncome vs. YearsAtCompany")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

CaseStudy2.data.NoIdOrUselessData %>% ggplot(aes(x=DailyRate, y=MonthlyIncome, colour = Attrition)) + geom_point() + geom_smooth() + ggtitle("MonthlyIncome vs. DailyRate")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

CaseStudy2.data.NoIdOrUselessData %>% ggplot(aes(x=MonthlyRate, y=MonthlyIncome, colour = Attrition)) + geom_point() + geom_smooth() + ggtitle("MonthlyIncome vs. MonthlyRate")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

CaseStudy2.data.NoIdOrUselessData %>% ggplot(aes(x=HourlyRate, y=MonthlyRate, colour = Attrition)) + geom_point() + geom_smooth() + ggtitle("MonthlyRate vs. HourlyRate")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Factor Analysis:

Variable Importance, Multi-colinearity corrected. Now that we know which continuous variables we can remove due to multi-colinearity, let’s re-fit the random forest and look at variable importance a second time.

ctrl <- trainControl(method="cv", summaryFunction=twoClassSummary, classProbs=T, savePredictions = T, number = 5)
fit2.HigherComplexity.Rf <- train(Attrition ~ . -I(TotalWorkingYears) -I(YearsAtCompany), data = CaseStudy2.data.NoIdOrUselessData, method = 'rf', trControl = ctrl, ntree = 200, metric = "ROC")

fit2.HigherComplexity.Rf$metric
## [1] "ROC"
# Train Confusion Matrix
plotConfusionMatrixByThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtrain, "Train Confusion Matrix Stats - Random Forest")

threshold <- .4
print("Train Confusion Matrix")
## [1] "Train Confusion Matrix"
print(paste("Threshold was:", threshold))
## [1] "Threshold was: 0.4"
confusionMatrixForCustomThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtrain, threshold)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Yes  No
##        Yes 112   0
##        No    0 584
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9947, 1)
##     No Information Rate : 0.8391     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.1609     
##          Detection Rate : 0.1609     
##    Detection Prevalence : 0.1609     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : Yes        
## 
# Test Confusion Matrix
plotConfusionMatrixByThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtest, "Test Confusion Matrix Stats - Random Forest")

print("Test Confusion Matrix")
## [1] "Test Confusion Matrix"
print(paste("Threshold was:", threshold))
## [1] "Threshold was: 0.4"
confusionMatrixForCustomThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtest, threshold)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Yes  No
##        Yes  28   0
##        No    0 146
##                                     
##                Accuracy : 1         
##                  95% CI : (0.979, 1)
##     No Information Rate : 0.8391    
##     P-Value [Acc > NIR] : 5.519e-14 
##                                     
##                   Kappa : 1         
##                                     
##  Mcnemar's Test P-Value : NA        
##                                     
##             Sensitivity : 1.0000    
##             Specificity : 1.0000    
##          Pos Pred Value : 1.0000    
##          Neg Pred Value : 1.0000    
##              Prevalence : 0.1609    
##          Detection Rate : 0.1609    
##    Detection Prevalence : 0.1609    
##       Balanced Accuracy : 1.0000    
##                                     
##        'Positive' Class : Yes       
## 
summary(fit2.HigherComplexity.Rf$finalModel)
##                 Length Class      Mode     
## call               5   -none-     call     
## type               1   -none-     character
## predicted        870   factor     numeric  
## err.rate         600   -none-     numeric  
## confusion          6   -none-     numeric  
## votes           1740   matrix     numeric  
## oob.times        870   -none-     numeric  
## classes            2   -none-     character
## importance        68   -none-     numeric  
## importanceSD       0   -none-     NULL     
## localImportance    0   -none-     NULL     
## proximity          0   -none-     NULL     
## ntree              1   -none-     numeric  
## mtry               1   -none-     numeric  
## forest            14   -none-     list     
## y                870   ordered    numeric  
## test               0   -none-     NULL     
## inbag              0   -none-     NULL     
## xNames            68   -none-     character
## problemType        1   -none-     character
## tuneValue          1   data.frame list     
## obsLevels          2   -none-     character
## param              1   -none-     list
# ROC Plot
res <- evalm(fit2.HigherComplexity.Rf,gnames=c('Train'), title='ROC: RandomForest') 
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 870
## Number of groups: 1
## Observations per group: 870
## Positive: No
## Negative: Yes
## Group: Train
## Positive: 730
## Negative: 140
## ***Performance Metrics***

## Train Optimal Informedness = 0.454500978473581
## Train AUC-ROC = 0.78

plot(varImp(fit2.HigherComplexity.Rf, scale = FALSE), top = 5, main="Variable Importance: Random Forest")

Classification

Random Forest

We now classify with Random Forest with ALl variables. Since we only care about prediction and not interpretation, we can just use all the multicolinear variables and not worry about it. Note: we tune the model to use a threshold of 0.38 to account for the imbalance in our dataset.

ctrl <- trainControl(method="cv", summaryFunction=twoClassSummary, classProbs=T, savePredictions = T, number = 10)
fit2.HigherComplexity.Rf <- train(Attrition ~ ., data = CaseStudy2.dtrain, method = 'rf', trControl = ctrl)
## Warning in train.default(x, y, weights = w, ...): The metric "Accuracy" was not
## in the result set. ROC will be used instead.
# Train Confusion Matrix
plotConfusionMatrixByThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtrain, "Train Confusion Matrix Stats - Random Forest")

threshold <- .145
print("Train Confusion Matrix")
## [1] "Train Confusion Matrix"
print(paste("Threshold was:", threshold))
## [1] "Threshold was: 0.145"
confusionMatrixForCustomThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtrain, threshold)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Yes  No
##        Yes 112  24
##        No    0 560
##                                           
##                Accuracy : 0.9655          
##                  95% CI : (0.9491, 0.9778)
##     No Information Rate : 0.8391          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.8825          
##                                           
##  Mcnemar's Test P-Value : 2.668e-06       
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.9589          
##          Pos Pred Value : 0.8235          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.1609          
##          Detection Rate : 0.1609          
##    Detection Prevalence : 0.1954          
##       Balanced Accuracy : 0.9795          
##                                           
##        'Positive' Class : Yes             
## 
# Test Confusion Matrix
plotConfusionMatrixByThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtest, "Test Confusion Matrix Stats - Random Forest")

print("Test Confusion Matrix")
## [1] "Test Confusion Matrix"
print(paste("Threshold was:", threshold))
## [1] "Threshold was: 0.145"
confusionMatrixForCustomThreshold(fit2.HigherComplexity.Rf, CaseStudy2.dtest, threshold)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Yes No
##        Yes  20 56
##        No    8 90
##                                           
##                Accuracy : 0.6322          
##                  95% CI : (0.5559, 0.7039)
##     No Information Rate : 0.8391          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1954          
##                                           
##  Mcnemar's Test P-Value : 4.228e-09       
##                                           
##             Sensitivity : 0.7143          
##             Specificity : 0.6164          
##          Pos Pred Value : 0.2632          
##          Neg Pred Value : 0.9184          
##              Prevalence : 0.1609          
##          Detection Rate : 0.1149          
##    Detection Prevalence : 0.4368          
##       Balanced Accuracy : 0.6654          
##                                           
##        'Positive' Class : Yes             
## 
summary(fit2.HigherComplexity.Rf$finalModel)
##                 Length Class      Mode     
## call               4   -none-     call     
## type               1   -none-     character
## predicted        696   factor     numeric  
## err.rate        1500   -none-     numeric  
## confusion          6   -none-     numeric  
## votes           1392   matrix     numeric  
## oob.times        696   -none-     numeric  
## classes            2   -none-     character
## importance        68   -none-     numeric  
## importanceSD       0   -none-     NULL     
## localImportance    0   -none-     NULL     
## proximity          0   -none-     NULL     
## ntree              1   -none-     numeric  
## mtry               1   -none-     numeric  
## forest            14   -none-     list     
## y                696   ordered    numeric  
## test               0   -none-     NULL     
## inbag              0   -none-     NULL     
## xNames            68   -none-     character
## problemType        1   -none-     character
## tuneValue          1   data.frame list     
## obsLevels          2   -none-     character
## param              0   -none-     list
# ROC Plot
res <- evalm(fit2.HigherComplexity.Rf,gnames=c('Train'), title='ROC: RandomForest') 
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 696
## Number of groups: 1
## Observations per group: 696
## Positive: No
## Negative: Yes
## Group: Train
## Positive: 584
## Negative: 112
## ***Performance Metrics***

## Train Optimal Informedness = 0.452788649706458
## Train AUC-ROC = 0.78

pred2 <- predict(fit2.HigherComplexity.Rf, newdata = CaseStudy2.dtest, type="prob")
test1 <- evalm(data.frame(pred2, CaseStudy2.dtest$Attrition),  title='Test ROC: Logistic Regression-LASSO')
## ***MLeval: Machine Learning Model Evaluation***
## Input: data frame of probabilities of observed labels
## Group does not exist, making column.
## Observations: 174
## Number of groups: 1
## Observations per group: 174
## Positive: No
## Negative: Yes
## Group: Group1
## Positive: 146
## Negative: 28
## ***Performance Metrics***

## Group1 Optimal Informedness = 0.37426614481409
## Group1 AUC-ROC = 0.7

Logistic Regression with LASSO for variable selection (AND prediction)

Note it appears that Logistic Regression with LASSO L1-regularization is absolutely outperformed by RandomForests.

## Logistic Regression: Fit Model - 
ctrl <- trainControl(method="cv", summaryFunction=twoClassSummary, classProbs=T, savePredictions = T, number = 5, search="grid")
fit2.HigherComplexity.Lasso <- train(Attrition ~ (.)^2 + I(Age^2) + I(DailyRate^2)+ I(DistanceFromHome^2)+ I(EmployeeNumber^2)+ I(HourlyRate^2)+ I(MonthlyIncome^2)+ I(MonthlyRate^2)+ I(NumCompaniesWorked^2)+ I(PercentSalaryHike^2) + I(TotalWorkingYears^2) + I(YearsAtCompany^2), data = CaseStudy2.dtrain, method = 'glmnet', trControl = ctrl, metric = "ROC", tuneGrid = data.frame(alpha = 1, lambda = 10^seq(-2, -1.75, by = 0.0001)))

summary(fit2.HigherComplexity.Lasso)
##             Length Class      Mode     
## a0             100 -none-     numeric  
## beta        227100 dgCMatrix  S4       
## df             100 -none-     numeric  
## dim              2 -none-     numeric  
## lambda         100 -none-     numeric  
## dev.ratio      100 -none-     numeric  
## nulldev          1 -none-     numeric  
## npasses          1 -none-     numeric  
## jerr             1 -none-     numeric  
## offset           1 -none-     logical  
## classnames       2 -none-     character
## call             5 -none-     call     
## nobs             1 -none-     numeric  
## lambdaOpt        1 -none-     numeric  
## xNames        2271 -none-     character
## problemType      1 -none-     character
## tuneValue        2 data.frame list     
## obsLevels        2 -none-     character
## param            0 -none-     list
coefficients <- coef(fit2.HigherComplexity.Lasso$finalModel, fit2.HigherComplexity.Lasso$bestTune$lambda)
# coefficients@Dimnames[[1]][which(coefficients > .0001 |coefficients < -.0001 )]

# Train Confusion Matrix
plotConfusionMatrixByThreshold(fit2.HigherComplexity.Lasso, CaseStudy2.dtrain, "Train Confusion Matrix Stats - Lasso")

threshold <- .5
print("Train Confusion Matrix")
## [1] "Train Confusion Matrix"
print(paste("Threshold was:", threshold))
## [1] "Threshold was: 0.5"
confusionMatrixForCustomThreshold(fit2.HigherComplexity.Lasso, CaseStudy2.dtrain, threshold)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Yes  No
##        Yes  77   0
##        No   35 584
##                                           
##                Accuracy : 0.9497          
##                  95% CI : (0.9308, 0.9647)
##     No Information Rate : 0.8391          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7869          
##                                           
##  Mcnemar's Test P-Value : 9.081e-09       
##                                           
##             Sensitivity : 0.6875          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9435          
##              Prevalence : 0.1609          
##          Detection Rate : 0.1106          
##    Detection Prevalence : 0.1106          
##       Balanced Accuracy : 0.8438          
##                                           
##        'Positive' Class : Yes             
## 
# Test Confusion Matrix
plotConfusionMatrixByThreshold(fit2.HigherComplexity.Lasso, CaseStudy2.dtest, "Test Confusion Matrix Stats - Lasso")

print("Test Confusion Matrix")
## [1] "Test Confusion Matrix"
print(paste("Threshold was:", threshold))
## [1] "Threshold was: 0.5"
confusionMatrixForCustomThreshold(fit2.HigherComplexity.Lasso, CaseStudy2.dtest, threshold)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Yes  No
##        Yes   8   4
##        No   20 142
##                                           
##                Accuracy : 0.8621          
##                  95% CI : (0.8018, 0.9096)
##     No Information Rate : 0.8391          
##     P-Value [Acc > NIR] : 0.2386          
##                                           
##                   Kappa : 0.3359          
##                                           
##  Mcnemar's Test P-Value : 0.0022          
##                                           
##             Sensitivity : 0.28571         
##             Specificity : 0.97260         
##          Pos Pred Value : 0.66667         
##          Neg Pred Value : 0.87654         
##              Prevalence : 0.16092         
##          Detection Rate : 0.04598         
##    Detection Prevalence : 0.06897         
##       Balanced Accuracy : 0.62916         
##                                           
##        'Positive' Class : Yes             
## 
# ROC Plot
res <- evalm(fit2.HigherComplexity.Lasso,gnames=c('Train'), title='ROC: Logistic Regression-LASSO') 
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 696
## Number of groups: 1
## Observations per group: 696
## Positive: No
## Negative: Yes
## Group: Train
## Positive: 584
## Negative: 112
## ***Performance Metrics***

## Train Optimal Informedness = 0.500978473581213
## Train AUC-ROC = 0.81

#Train 
res <- evalm(fit2.HigherComplexity.Lasso,gnames=c('Train'), title='Train ROC: Logistic Regression-LASSO') 
## ***MLeval: Machine Learning Model Evaluation***
## Input: caret train function object
## Not averaging probs.
## Group 1 type: cv
## Observations: 696
## Number of groups: 1
## Observations per group: 696
## Positive: No
## Negative: Yes
## Group: Train
## Positive: 584
## Negative: 112
## ***Performance Metrics***

## Train Optimal Informedness = 0.500978473581213
## Train AUC-ROC = 0.81

#Test 
pred2 <- predict(fit2.HigherComplexity.Lasso, newdata = CaseStudy2.dtest, type="prob")
test1 <- evalm(data.frame(pred2, CaseStudy2.dtest$Attrition),  title='Test ROC: Logistic Regression-LASSO')
## ***MLeval: Machine Learning Model Evaluation***
## Input: data frame of probabilities of observed labels
## Group does not exist, making column.
## Observations: 174
## Number of groups: 1
## Observations per group: 174
## Positive: No
## Negative: Yes
## Group: Group1
## Positive: 146
## Negative: 28
## ***Performance Metrics***

## Group1 Optimal Informedness = 0.487279843444227
## Group1 AUC-ROC = 0.78

summary(fit2.HigherComplexity.Lasso)
##             Length Class      Mode     
## a0             100 -none-     numeric  
## beta        227100 dgCMatrix  S4       
## df             100 -none-     numeric  
## dim              2 -none-     numeric  
## lambda         100 -none-     numeric  
## dev.ratio      100 -none-     numeric  
## nulldev          1 -none-     numeric  
## npasses          1 -none-     numeric  
## jerr             1 -none-     numeric  
## offset           1 -none-     logical  
## classnames       2 -none-     character
## call             5 -none-     call     
## nobs             1 -none-     numeric  
## lambdaOpt        1 -none-     numeric  
## xNames        2271 -none-     character
## problemType      1 -none-     character
## tuneValue        2 data.frame list     
## obsLevels        2 -none-     character
## param            0 -none-     list

Regression:

GLM with Lasso. For prediction of Monthly Income use Multiple Linear Regression with Lasso L1-Regularization. Note that the RSME is well below the requirement (~$500). Although we can see some overfitting on the training set, the test prediction is still acceptably low.

ctrl <- trainControl(method="cv", number = 5, search="grid")
fit2.HigherComplexity.Lasso.Regression <- train(MonthlyIncome ~ (.)^2 + I(Age^2) + I(DailyRate^2)+ I(DistanceFromHome^2)+ I(EmployeeNumber^2)+ I(HourlyRate^2)+ I(MonthlyIncome^2)+ I(MonthlyRate^2)+ I(NumCompaniesWorked^2)+ I(PercentSalaryHike^2) + I(TotalWorkingYears^2) + I(YearsAtCompany^2), data = CaseStudy2.dtrain, method = 'glmnet', trControl = ctrl, tuneGrid = data.frame(alpha =  1, lambda = 10^seq(-2, -1.75, by = 0.0001)))

summary(fit2.HigherComplexity.Lasso.Regression)
##             Length Class      Mode     
## a0             100 -none-     numeric  
## beta        227100 dgCMatrix  S4       
## df             100 -none-     numeric  
## dim              2 -none-     numeric  
## lambda         100 -none-     numeric  
## dev.ratio      100 -none-     numeric  
## nulldev          1 -none-     numeric  
## npasses          1 -none-     numeric  
## jerr             1 -none-     numeric  
## offset           1 -none-     logical  
## call             5 -none-     call     
## nobs             1 -none-     numeric  
## lambdaOpt        1 -none-     numeric  
## xNames        2271 -none-     character
## problemType      1 -none-     character
## tuneValue        2 data.frame list     
## obsLevels        1 -none-     logical  
## param            0 -none-     list
coefficients <- coef(fit2.HigherComplexity.Lasso.Regression$finalModel, fit2.HigherComplexity.Lasso.Regression$bestTune$lambda)

train_prediction=predict(fit2.HigherComplexity.Lasso.Regression, CaseStudy2.dtrain)
train_ASE<-mean((CaseStudy2.dtrain$MonthlyIncome-train_prediction)^2)
print("Train RSME")
## [1] "Train RSME"
(train_ASE)^0.5
## [1] 483.6119
test_prediction=predict(fit2.HigherComplexity.Lasso.Regression, CaseStudy2.dtest)
test_ASE<-mean((CaseStudy2.dtest$MonthlyIncome-test_prediction)^2)
print("Test RSME")
## [1] "Test RSME"
(test_ASE)^0.5
## [1] 495.8135
print("Use this method! RSME is acceptable!")
## [1] "Use this method! RSME is acceptable!"

KNN

Note Caret selects (via Cross-validation) k=5 as the optimal k-value (using RSME as the metric and cross-validation). Unfortunately, BOTH the Train and Test RSME are BAD! This is likely because we’ve only used numerical variables in our model (since they are the only ones that work for KNN)

reduced.numerical.train <- CaseStudy2.dtrain %>% dplyr::select(Age, DailyRate, DistanceFromHome, EmployeeNumber, HourlyRate, MonthlyIncome, MonthlyRate, NumCompaniesWorked, PercentSalaryHike, TotalWorkingYears, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager)

reduced.numerical.test <- CaseStudy2.dtest %>% dplyr::select(Age, DailyRate, DistanceFromHome, EmployeeNumber, HourlyRate, MonthlyIncome, MonthlyRate, NumCompaniesWorked, PercentSalaryHike, TotalWorkingYears, YearsAtCompany, YearsInCurrentRole, YearsSinceLastPromotion, YearsWithCurrManager)

ctrl <- trainControl(method="repeatedcv",repeats = 5) 
knnFit <- train(MonthlyIncome ~ ., data = reduced.numerical.train, method = "knn", trControl = ctrl, preProcess = c("center","scale"), tuneLength = 20)

train_prediction=predict(knnFit, reduced.numerical.train)
train_ASE<-mean((reduced.numerical.train$MonthlyIncome-train_prediction)^2)
print("Train RSME")
## [1] "Train RSME"
(train_ASE)^0.5
## [1] 2944.824
test_prediction=predict(knnFit, reduced.numerical.test)
test_ASE<-mean((reduced.numerical.test$MonthlyIncome-test_prediction)^2)
print("Test RSME")
## [1] "Test RSME"
(test_ASE)^0.5
## [1] 3401.035
print("RSME is BAD for KNN!!! DON'T USE Regression")
## [1] "RSME is BAD for KNN!!! DON'T USE Regression"